# Description ------------------------------------------------------------------
#   This script combines consumption, price, and weather data for PG&E's
#   service area (by zip code).
#   - INPUT: price, consumption, and weather datasets
#   - OUTPUT: zip-by-utility datasets (in DataR/Prices/PGE)
#   - NEXT SCRIPT: buildPanelCommonZips.R

# Exclusions -------------------------------------------------------------------
#   - I dropped SoCal price data from advice letters 3644, 3680, 3695, 3807,
#     4053, and 4061, as they were updated by letters 3660, 3697, 3697, 3810,
#     4055, and 4069, respectively.
#   - I dropped pre-2008 data (PG&E and prices/allowances), as SoCalGas did not
#     share billing data for pre-2009 bills.
#   - I trimmed the shortest 2.5% and longest 2.5% bills (resulted in keeping
#     bills of length between 28-34 days). I did this to omit the first or last
#     bills for a household.

# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = FALSE)
  # Packages
  library(readstata13)
  library(magrittr)
  library(data.table)
  library(stringr)
  library(lubridate)
  library(foreach)
  library(doParallel)
  # Directories of joined bill-weather files
  dir_data       <- "S:/Edward/JoinedBillWeather/"
  dir_data_pge   <- paste0(dir_data, "PGE/")
  dir_data_socal <- paste0(dir_data, "SoCal/")
  dir_data_sdge  <- paste0(dir_data, "SDGE/")
  dir_elasticity <- "S:/Edward/Elasticity/"
  dir_csv        <- paste0(dir_elasticity, "DataCsv/")
  dir_rds        <- paste0(dir_elasticity, "DataR/")
  dir_sample     <- paste0(dir_rds, "Prices/SampledSubsets/PGE05Percent/")
  # More options
  options(datatable.print.nrows = 10)
  # Set seed
  set.seed(12345)

# Load relevant datasets -------------------------------------------------------
	# Create vector of zip codes for each utility (based upon existing files)
	pge_zips <- dir_data_pge %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
	socal_zips <- dir_data_socal %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
	sdge_zips <- dir_data_sdge %>% dir() %>%
		gsub(pattern = "[^0-9]", replacement = "")
  # Create table of zip codes and presence of utilities
  all_zip_dt <- data.table(zip = unique(c(pge_zips, socal_zips, sdge_zips)))
  all_zip_dt[, `:=`(
    pge   = zip %in% pge_zips,
    sdge  = zip %in% sdge_zips,
    socal = zip %in% socal_zips
    )]
  # Save the table as CSV
  write.csv(x = all_zip_dt,
    file = paste0(dir_csv, "utilityZipCodeMap.csv"),
    row.names = F)
  # Vector of the zips common to PG&E and SoCalGas
  common_zips <- all_zip_dt[pge == T & socal == T]$zip
	# Vector of all unique zips
	all_zips <- unique(c(pge_zips, socal_zips, sdge_zips))
	all_zips <- all_zips[order(all_zips)]
  # Load daily allowance dataset
  pge_allowance <- fread(paste0(dir_csv, "pgeBaselinesMonthly.csv"))
  # Change name of date variable
  setnames(pge_allowance, "month_date", "date_eff_allow")
  # Load price dataset
  pge_prices <- fread(paste0(dir_csv, "pgeTariffsCombined.csv"))
  # Load degree day data
  dd_dt <- readRDS(paste0(dir_rds, "degreeDayNARR.rds"))
  # Load Henry Hub spot price data
  hub_dt <- fread(paste0(dir_csv, "dailyHenryHubPricesWithLags.csv"))

# Data work with degree-day dataset --------------------------------------------
  # Convert data to Date format
  dd_dt[, date := as.Date(date)]
  # Convert degree-day dataset to wide
  dd_ca <- dd_dt[region == "ca"][, region := NULL]
  dd_east <- dd_dt[region == "east"][, region := NULL]
  setnames(dd_ca, paste0("ca_", names(dd_ca)))
  setnames(dd_east, paste0("east_", names(dd_east)))
  setnames(dd_ca, "ca_date", "date")
  setnames(dd_east, "east_date", "date")
  dd_dt <- merge(dd_ca, dd_east, by = "date")
  # Create measures of total degree days for last month, 6 months, and year
  dd_dt[, `:=`(
    ca_6month_hdd =
      ca_tot_hdd_lag_month01 + ca_tot_hdd_lag_month02 +
      ca_tot_hdd_lag_month03 + ca_tot_hdd_lag_month04 +
      ca_tot_hdd_lag_month05 + ca_tot_hdd_lag_month06,
    ca_6month_cdd =
      ca_tot_cdd_lag_month01 + ca_tot_cdd_lag_month02 +
      ca_tot_cdd_lag_month03 + ca_tot_cdd_lag_month04 +
      ca_tot_cdd_lag_month05 + ca_tot_cdd_lag_month06,
    east_6month_hdd =
      east_tot_hdd_lag_month01 + east_tot_hdd_lag_month02 +
      east_tot_hdd_lag_month03 + east_tot_hdd_lag_month04 +
      east_tot_hdd_lag_month05 + east_tot_hdd_lag_month06,
    east_6month_cdd =
      east_tot_cdd_lag_month01 + east_tot_cdd_lag_month02 +
      east_tot_cdd_lag_month03 + east_tot_cdd_lag_month04 +
      east_tot_cdd_lag_month05 + east_tot_cdd_lag_month06
    )]
  dd_dt[, `:=`(
    ca_6month_dd = ca_6month_cdd + ca_6month_hdd,
    east_6month_dd = east_6month_cdd + east_6month_hdd
    )]
  dd_dt[, `:=`(
    ca_year_hdd = ca_6month_hdd +
      ca_tot_hdd_lag_month07 + ca_tot_hdd_lag_month08 +
      ca_tot_hdd_lag_month09 + ca_tot_hdd_lag_month10 +
      ca_tot_hdd_lag_month11 + ca_tot_hdd_lag_month12,
    ca_year_cdd = ca_6month_cdd +
      ca_tot_cdd_lag_month07 + ca_tot_cdd_lag_month08 +
      ca_tot_cdd_lag_month09 + ca_tot_cdd_lag_month10 +
      ca_tot_cdd_lag_month11 + ca_tot_cdd_lag_month12,
    east_year_hdd = east_6month_hdd +
      east_tot_hdd_lag_month07 + east_tot_hdd_lag_month08 +
      east_tot_hdd_lag_month09 + east_tot_hdd_lag_month10 +
      east_tot_hdd_lag_month11 + east_tot_hdd_lag_month12,
    east_year_cdd = east_6month_cdd +
      east_tot_cdd_lag_month07 + east_tot_cdd_lag_month08 +
      east_tot_cdd_lag_month09 + east_tot_cdd_lag_month10 +
      east_tot_cdd_lag_month11 + east_tot_cdd_lag_month12
    )]
  dd_dt[, `:=`(
    ca_year_dd = ca_year_cdd + ca_year_hdd,
    east_year_dd = east_year_cdd + east_year_hdd
    )]
  dd_dt[, `:=`(
    ca_1month_dd = ca_tot_hdd_lag_month01 + ca_tot_cdd_lag_month01,
    east_1month_dd = east_tot_hdd_lag_month01 + east_tot_cdd_lag_month01
    )]
  # Change names
  setnames(dd_dt,
    old = c("ca_tot_hdd_lag_month01", "ca_tot_cdd_lag_month01",
      "east_tot_hdd_lag_month01", "east_tot_cdd_lag_month01"),
    new = c("ca_1month_hdd", "ca_1month_cdd",
      "east_1month_hdd", "east_1month_cdd"))
  # Keep subset of variables
  dd_dt <- dd_dt[, .(date,
    ca_1month_dd, ca_1month_hdd, ca_1month_cdd,
    east_1month_dd, east_1month_hdd, east_1month_cdd,
    ca_6month_dd, ca_6month_hdd, ca_6month_cdd,
    east_6month_dd, east_6month_hdd, east_6month_cdd,
    ca_year_dd, ca_year_hdd, ca_year_cdd,
    east_year_dd, east_year_hdd, east_year_cdd
    )]

# Fix formatting in price/allowance data ---------------------------------------
  # Convert characters to dates where appropriate
  pge_allowance[, `:=`(
    date_eff_allow = date_eff_allow %>% ymd() %>% as.Date(),
    eff_start = eff_start %>% ymd() %>% as.Date(),
    eff_stop = eff_stop %>% ymd() %>% as.Date()
    )]
  pge_prices[, date_eff := date_eff %>% mdy() %>% as.Date()]
  # Subset datasets to 2000 through 2015
  # PGE bill data span 2003-2014; SoCal bill data span 2010-2015
  pge_allowance <- pge_allowance[between(year(date_eff_allow), 2000, 2015)]
  pge_prices <- pge_prices[between(year(date_eff), 2000, 2015)]
  # Remove dollar signs from prices; convert to numeric
  # (left over from original XLS files)
  tmp <- pge_prices[, 3:ncol(pge_prices), with = F]
  for(j in 1:ncol(tmp)) {
    set(x = tmp, j = j, value = gsub("\\$", "", tmp[[j]]) %>% as.numeric())
  }
  pge_prices <- cbind(pge_prices[, 1:2, with = F], tmp); rm(tmp)
  # Drop eff_start, eff_stop, season, and month
  pge_allowance[, c("eff_start", "eff_stop", "season", "month") := NULL]
  # Change name of "territory" to "climate"
  setnames(pge_allowance, "territory", "climate")
  # Change name of "base_" to "allow_"
  setnames(pge_allowance,
    c("base_ind", "base_master"),
    c("allow_ind", "allow_master"))
  # Drop advice number from the price data
  pge_prices[, advice_no := NULL]
  # Change date_eff to date_eff_price
  setnames(pge_prices, "date_eff", "date_eff_price")
  # Change name of non-care PPPS variable
  setnames(pge_prices, "ppps_non_care", "ppps")
  # Drop variables that I do not need
  pge_prices <- pge_prices[, .(date_eff_price, charge_base, charge_excess,
    charge_care_base, charge_care_excess, ppps, ppps_care)]
  # Merge degree-data data with price data
  pge_prices %<>% merge(y = dd_dt, by.x = "date_eff_price", by.y = "date",
    all.x = T, all.y = F, sort = F)
  # Create datasets of dates on which price regimes become effective
  pge_price_dates <- pge_prices[, .(date_eff_price)]
  # Add two leads of effective price date
  setorder(pge_price_dates, date_eff_price)
  pge_price_dates[, c(paste0("date_eff_price_", 1:2, "lead")) :=
    shift(date_eff_price, 1L:2L, NA, "lead")]
  # Create datasets of dates on which price regimes become effective
  pge_allowance_dates <- pge_allowance[, .(date_eff_allow)] %>% unique()
  # Add two leads of effective price date
  setorder(pge_allowance_dates, date_eff_allow)
  pge_allowance_dates[, c(paste0("date_eff_allow_", 1:2, "lead")) :=
    shift(date_eff_allow, 1L:2L, NA, "lead")]
  # Change missing ppps and ppps_care to zeros
  pge_prices[is.na(ppps), ppps := 0]
  pge_prices[is.na(ppps_care), ppps_care := 0]

# Join Henry Hub spot price data with utilities' price data --------------------
  # Convert spot price dates to Date format
  hub_dt[, date := date %>% ymd() %>% as.Date()]
  # Drop the day's spot price (we're keeping the weekly averages)
  hub_dt[, spot_price := NULL]
  # Merge the datasets
  pge_prices %<>% merge(y = hub_dt, by.x = "date_eff_price", by.y = "date",
    all.x = T, all.y = F, sort = F)

# Convert zip code files to R format -------------------------------------------
#  # Start the cluster
#  cl <- makeCluster(6)
#  registerDoParallel(cl)
#  # For each zip code: open the relevant PGE/SoCal/SDGE files; save as .rds
#	foreach(the_zip = all_zips,
#    .packages = c("readstata13", "data.table")) %dopar% {
#      # If there is a PGE file, save it as .rds
#      if(the_zip %in% pge_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_pge, "joined_pge_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_pge_", the_zip, ".rds"))
#      # If there is a SoCal file, save it as .rds
#      if(the_zip %in% socal_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_socal, "joined_socal_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_socal_", the_zip, ".rds"))
#      # If there is a SDGE file, save it as .rds
#      if(the_zip %in% sdge_zips) saveRDS(object =
#        read.dta13(paste0(dir_data_sdge, "joined_sdge_", the_zip, ".dta")),
#        file = paste0(dir_rds, "JoinedWeather/joined_sdge_", the_zip, ".rds"))
#    }
#  # Kill cluster
#  stopCluster(cl)

# Open billing files and attach price data -------------------------------------
  # Start the cluster
  cl <- makeCluster(12)
  registerDoParallel(cl)
  # Loop over the zip codes
  foreach(the_zip = pge_zips,
#  foreach(the_zip = common_zips,
    .packages = c("data.table", "magrittr", "lubridate", "stringr")) %dopar% {
    #the_zip <- 93224
    #the_zip <- 93311
    # Load the zip code's PGE utility file
    pge_dt <- readRDS(paste0(dir_rds, "JoinedWeather/joined_pge_",
      the_zip, ".rds")) %>% data.table()
    # Grab temperature bin columns
    pge_weather <- pge_dt[,
      c("begin_date", "end_date", "hh_id", paste0("t_p_", 1:14), "days"),
      with = F]
    # Convert temperature bins from 30-day equivalent to actual numbers of days
# HACK: We will have different numbers of days for weather and bill, since
# my definition of bill length change below (relative to the definition that
# I used in the weather calculations).
    pge_weather[, `:=`(
      t_p_1  = t_p_1  * days / 30,
      t_p_2  = t_p_2  * days / 30,
      t_p_3  = t_p_3  * days / 30,
      t_p_4  = t_p_4  * days / 30,
      t_p_5  = t_p_5  * days / 30,
      t_p_6  = t_p_6  * days / 30,
      t_p_7  = t_p_7  * days / 30,
      t_p_8  = t_p_8  * days / 30,
      t_p_9  = t_p_9  * days / 30,
      t_p_10 = t_p_10 * days / 30,
      t_p_11 = t_p_11 * days / 30,
      t_p_12 = t_p_12 * days / 30,
      t_p_13 = t_p_13 * days / 30,
      t_p_14 = t_p_14 * days / 30
      )][, days := NULL]
    # Drop temperature variables
    pge_dt <- pge_dt[, 1:16, with = F]
    # Drop other unwanted variables
    pge_dt[, c("zip5", "serv_start", "serv_stop", "prem_id", "acct_id",
      "serv_id") := NULL]
    # Drop customers that are not standard residential or CARE residential
    pge_dt <- pge_dt[rate_sch %in% c("G1", "GL1")]
    # Recalculate bill lengths
		# Example: 12/1-12/15 will be 14 days, rather than 15 days
    pge_dt[, days := (end_date - begin_date) %>% as.numeric()]
    # Change "NA" to NA for zip_plus4
    pge_dt[zip_plus4 == "NA", zip_plus4 := NA]
		# Create month and year variables
	  pge_dt[, `:=`(
	    month = str_pad(month(begin_date), 2, "left", 0),
	    year = year(begin_date))]
	  # Create year-month variable
	  pge_dt[, ym :=
	    paste0(year, month)]

    # Rolling merge price dates onto utility dataset ---------------------------
    # Add copies of columns used in the rolling merge
    pge_dt[, begin_date_copy := begin_date]
    pge_price_dates[, date_eff_price_copy := date_eff_price]
    pge_allowance_dates[, date_eff_allow_copy := date_eff_allow]
    # Set keys for the rolling merge
    setkey(pge_dt, begin_date_copy, hh_id)
    setkey(pge_price_dates, date_eff_price_copy)
    setkey(pge_allowance_dates, date_eff_allow_copy)
    # The rolling merge: merge using the begin date of the bill and the price
    # regime that precedes the begin date
    pge_dt <- pge_price_dates[pge_dt, roll = Inf]
    pge_dt[, date_eff_price_copy := NULL]
    # The same rolling merge for allowances
    pge_dt[, begin_date_copy := begin_date]
    setkey(pge_dt, begin_date_copy, hh_id)
    pge_dt <- pge_allowance_dates[pge_dt, roll = Inf]
    pge_dt[, date_eff_allow_copy := NULL]
    # Change names of date_eff_price/allow to date_price/allow
    setnames(pge_dt,
      old = paste0("date_eff_price", c("", "_1lead", "_2lead")),
      new = paste0("date_price", 1:3))
    setnames(pge_dt,
      old = paste0("date_eff_allow", c("", "_1lead", "_2lead")),
      new = paste0("date_allow", 1:3))

    # Merge billing data with price and allowance data -------------------------
    # For each of the three "date_price"s
    #   (1) Change names in pge_allowance
    #   (2) Change names in pge_prices
    #   (3) Merge the billing and allowance data
    #   (4) Merge the billing and price data
    # PGE: date_price1
    setnames(pge_allowance, paste0(names(pge_allowance), 1))
    setnames(pge_prices, paste0(names(pge_prices), 1))
    pge_dt %<>% merge(y = pge_allowance,
      by.x = c("climate", "date_allow1"),
			by.y = c("climate1", "date_eff_allow1"),
      all.x = T, all.y = F)
    pge_dt %<>% merge(y = pge_prices,
      by.x = "date_price1",
      by.y = "date_eff_price1",
      all.x = T, all.y = F)
    # PGE: date_price2
    setnames(pge_allowance, gsub("[1]$", "2", names(pge_allowance)))
    setnames(pge_prices, gsub("[1]$", "2", names(pge_prices)))
    pge_dt %<>% merge(y = pge_allowance,
      by.x = c("climate", "date_allow2"),
			by.y = c("climate2", "date_eff_allow2"),
      all.x = T, all.y = F)
    pge_dt %<>% merge(y = pge_prices,
      by.x = "date_price2",
			by.y = "date_eff_price2",
			all.x = T, all.y = F)
    # PGE: date_price3
    setnames(pge_allowance, gsub("[2]$", "3", names(pge_allowance)))
    setnames(pge_prices, gsub("[2]$", "3", names(pge_prices)))
    pge_dt %<>% merge(y = pge_allowance,
      by.x = c("climate", "date_allow3"),
			by.y = c("climate3", "date_eff_allow3"),
      all.x = T, all.y = F)
    pge_dt %<>% merge(y = pge_prices,
      by.x = "date_price3",
			by.y = "date_eff_price3",
			all.x = T, all.y = F)
    # Return PGE column names to unnumbered (will be used in other iterations)
    setnames(pge_allowance, gsub("[3]$", "", names(pge_allowance)))
    setnames(pge_prices, gsub("[3]$", "", names(pge_prices)))

    # Determine days spent in each price regime --------------------------------
    # Alphabetize columns
    setcolorder(pge_dt, order(names(pge_dt)))
    # Calculate the number of days within each of the unique periods.
    # NOTE: The bill can end in any of the three regimes, which affects how the
    # we calculate the number of days in the regimes.
    # First: Bills that end before the second price regime begins
    pge_dt[end_date <= date_price2, `:=`(
      days1 = (end_date - begin_date) %>% as.numeric(),
      days2 = 0,
      days3 = 0
      )]
    # Second: Bills that end during the second price regime
    pge_dt[(end_date > date_price2) & (end_date <= date_price3), `:=`(
      days1 = (date_price2 - begin_date) %>% as.numeric(),
      days2 = (end_date - date_price2) %>% as.numeric(),
      days3 = 0
      )]
    # Third: Bills that end during the third price regime
    pge_dt[end_date > date_price3, `:=`(
      days1 = (date_price2 - begin_date) %>% as.numeric(),
      days2 = (date_price3 - date_price2) %>% as.numeric(),
      days3 = (end_date - date_price3) %>% as.numeric()
      )]
    # Alphabetize column order
    setcolorder(pge_dt, order(names(pge_dt)))
    # Order rows
    setorder(pge_dt, hh_id, begin_date)

    # Calculate prices and fees ------------------------------------------------
    # NOTE: This process differs between PG&E and SoCalGas.
    # Calculate the share of the bill that fell in each period
    pge_dt[, `:=`(
      share1 = days1 / days,
      share2 = days2 / days,
      share3 = days3 / days
      )]
    # Calculate baseline allowance for the bill
    # NOTE: I assume all bills are for individual households (not multifamily)
    pge_dt[, `:=`(
      allowance1 = days1 * allow_ind1,
      allowance2 = days2 * allow_ind2,
      allowance3 = days3 * allow_ind3
      )]
    # Allocate each period's share of 'thm': PG&E allocates total therms ('thm')
    # into months, using time as weights ('share' here)
    pge_dt[, `:=`(
      thm1 = share1 * thm,
      thm2 = share2 * thm,
      thm3 = share3 * thm
      )]
    # If 'share2' (or 'share3') is 0, then set 'thm2' (or thm3) to NA
    pge_dt[share2 == 0, thm2 := NA]
    pge_dt[share3 == 0, thm3 := NA]
    # Determine whether the household exceeded their baseline allowance
    pge_dt[, `:=`(
      exceeded1 = thm1 > allowance1,
      exceeded2 = thm2 > allowance2,
      exceeded3 = thm3 > allowance3
      )]
    # By how much did the household exceed their baseline? This quantity gives
    # consumption on the second-tier price
    pge_dt[, `:=`(
      thm_excess1 = thm1 - allowance1,
      thm_excess2 = thm2 - allowance2,
      thm_excess3 = thm3 - allowance3
      )]
    pge_dt[exceeded1 == F, thm_excess1 := 0]
    pge_dt[exceeded2 == F, thm_excess2 := 0]
    pge_dt[exceeded3 == F, thm_excess3 := 0]
    # Add the number of therms on which the household paid baseline rate
    pge_dt[exceeded1 == T, thm_base1 := allowance1]
    pge_dt[exceeded1 == F, thm_base1 := thm1]
    pge_dt[exceeded2 == T, thm_base2 := allowance2]
    pge_dt[exceeded2 == F, thm_base2 := thm2]
    pge_dt[exceeded3 == T, thm_base3 := allowance3]
    pge_dt[exceeded3 == F, thm_base3 := thm3]
    # NOTE: At this point in the SoCalGas script, we calculate weighted prices
    # however, PG&E calculated prices slightly differently, so we need a few
    # more steps before we get to prices

    # Weight degree days and spot prices ---------------------------------------
    # Calculate weighted averages of degree-day/temperature tallies
    pge_dt[, `:=`(
      ca_year_dd  = (days1 * ca_year_dd1 +
        days2 * ca_year_dd2 + days3 * ca_year_dd3) / days,
      ca_year_cdd = (days1 * ca_year_cdd1 +
        days2 * ca_year_cdd2 + days3 * ca_year_cdd3) / days,
      ca_year_hdd = (days1 * ca_year_hdd1 +
        days2 * ca_year_hdd2 + days3 * ca_year_hdd3) / days,
      east_year_dd  = (days1 * east_year_dd1 +
        days2 * east_year_dd2 + days3 * east_year_dd3) / days,
      east_year_cdd = (days1 * east_year_cdd1 +
        days2 * east_year_cdd2 + days3 * east_year_cdd3) / days,
      east_year_hdd = (days1 * east_year_hdd1 +
        days2 * east_year_hdd2 + days3 * east_year_hdd3) / days
      )]
    # Calculate period-length-weighed averages of weekly spot price variables
    pge_dt[, `:=`(
      spot_price_1week = (days1 * spot_price_lag_1_71 +
        days2 * spot_price_lag_1_72 + days3 * spot_price_lag_1_73) / days,
      spot_price_2week = (days1 * spot_price_lag_8_141 +
        days2 * spot_price_lag_8_142 + days3 * spot_price_lag_8_143) / days,
      spot_price_3week = (days1 * spot_price_lag_15_211 +
        days2 * spot_price_lag_15_212 + days3 * spot_price_lag_15_213) / days,
      spot_price_4week = (days1 * spot_price_lag_22_281 +
        days2 * spot_price_lag_22_282 + days3 * spot_price_lag_22_283) / days,
      spot_price_5week = (days1 * spot_price_lag_29_351 +
        days2 * spot_price_lag_29_352 + days3 * spot_price_lag_29_353) / days,
      spot_price_6week = (days1 * spot_price_lag_36_421 +
        days2 * spot_price_lag_36_422 + days3 * spot_price_lag_36_423) / days,
      NULL = NULL)]
      # Keep the degree day and price variables from the first price and
      # allowance regimes (the regimes in place when the bill began). I use
      # these values in an alternate empirical strategy.
      pge_dt[, `:=`(
        charge_base_init        = charge_base1,
        charge_care_base_init   = charge_care_base1,
        charge_excess_init      = charge_excess1,
        charge_care_excess_init = charge_care_excess1,
        ca_year_dd_init         = ca_year_dd1,
        ca_year_cdd_init        = ca_year_cdd1,
        ca_year_hdd_init        = ca_year_hdd1,
        east_year_dd_init       = east_year_dd1,
        east_year_cdd_init      = east_year_cdd1,
        east_year_hdd_init      = east_year_hdd1,
        spot_price_1week_init   = spot_price_lag_1_71,
        spot_price_2week_init   = spot_price_lag_8_141,
        spot_price_3week_init   = spot_price_lag_15_211,
        spot_price_4week_init   = spot_price_lag_22_281,
        spot_price_5week_init   = spot_price_lag_29_351,
        spot_price_6week_init   = spot_price_lag_36_421,
        NULL = NULL)]

    # Calculate (sub-) bills ---------------------------------------------------
    # NOTE 1: This section of the script is very different from the section in
    # SoCal's script, due to the different ways of dealing with bills that
    # span multiple calendar months.
    # NOTE 2: It appears as though the PG&E "rev" variable excludes PPPS-related
    # charges. SoCal's data has the actual total bill.
    # The CARE bills (revenue) for each sub-bill periods (month)
    pge_dt[, `:=`(
      rev_care1 =
        thm_base1 * charge_care_base1 + thm_excess1 * charge_care_excess1,
      rev_care2 =
        thm_base2 * charge_care_base2 + thm_excess2 * charge_care_excess2,
      rev_care3 =
        thm_base3 * charge_care_base3 + thm_excess3 * charge_care_excess3
      )]
    # The non-CARE bills (revenue) for each sub-bill periods (month)
    pge_dt[, `:=`(
      rev_noncare1 =
        thm_base1 * charge_base1 + thm_excess1 * charge_excess1,
      rev_noncare2 =
        thm_base2 * charge_base2 + thm_excess2 * charge_excess2,
      rev_noncare3 =
        thm_base3 * charge_base3 + thm_excess3 * charge_excess3
      )]
    # If a period has 0 share, then change the period's bills to zero
    pge_dt[share2 == 0, `:=`(rev_noncare2 = 0, rev_care2 = 0)]
    pge_dt[share3 == 0, `:=`(rev_noncare3 = 0, rev_care3 = 0)]
    # Calculate the total bill for CARE and non-CARE
    pge_dt[, `:=`(
      rev_care = rev_care1 + rev_care2 + rev_care3,
      rev_noncare = rev_noncare1 + rev_noncare2 + rev_noncare3
      )]
    # Calculate the error for each revenue calculation
    pge_dt[, `:=`(
      rev_care_error = rev - rev_care,
      rev_noncare_error = rev - rev_noncare
      )]
    # Find the minimum absolute error for each bill
    pge_dt[, row_no := 1:.N][,
      rev_error_min := min(abs(rev_noncare_error), abs(rev_care_error)),
      by = row_no][, row_no := NULL]
    # Determine if the HH is a CARE HH using nearest bill estimate
    pge_dt[, care := rev_error_min == abs(rev_care_error)]
    # Calculate the percentage of observations that I classify as CARE
    pge_dt[, care_pct := sum(care == T) / .N, by = hh_id]
    # Use the CARE classification to determine the predicted rev and errors
    pge_dt[care == T, rev_hat := rev_care]
    pge_dt[care == T, rev_hat1 := rev_care1]
    pge_dt[care == T, rev_hat2 := rev_care2]
    pge_dt[care == T, rev_hat3 := rev_care3]
    pge_dt[care == F, rev_hat := rev_noncare]
    pge_dt[care == F, rev_hat1 := rev_noncare1]
    pge_dt[care == F, rev_hat2 := rev_noncare2]
    pge_dt[care == F, rev_hat3 := rev_noncare3]
    pge_dt[, rev_error := rev - rev_hat]
    # Delete unwanted columns
    pge_dt[, c("rev_care", "rev_care_error",
      "rev_care1", "rev_care2", "rev_care3",
      "rev_noncare", "rev_noncare_error",
      "rev_noncare1", "rev_noncare2", "rev_noncare3",
      "rev_error_min") := NULL]
    # Find percentage error
    pge_dt[, pct_error := rev_error / rev]
    pge_dt[rev == 0, pct_error := 0]
    # Flag bills where the percentage error is more than 5 percent
    pge_dt[, bill_flag := abs(pct_error) > 0.05]
    # Extend a lag of the flag (to know if the previous bill was flagged)
    setorder(pge_dt, hh_id, begin_date)
    pge_dt[,
      bill_flag_lag1 := shift(bill_flag, n = 1L, fill = NA, type = "lag"),
      by = hh_id]
    # Count the number of flags within a household
    pge_dt[, n_flags := sum(bill_flag), hh_id]
    # Calculate the percentage of flags within a household
    pge_dt[, pct_flags := n_flags / .N, hh_id]

    # Adjust CARE status for PG&E's winter savings programs --------------------
    # NOTE: This adjustment is only necessary for PG&E
    # Create 4 lags and 4 leads of CARE status
    setorder(pge_dt, hh_id, begin_date)
  	pge_dt[, paste0("care_lag", 1:4) :=
      shift(care, n = c(1L:4L), fill = NA, type = "lag", give.names = F),
      by = hh_id]
    setorder(pge_dt, hh_id, begin_date)
  	pge_dt[, paste0("care_lead", 1:4) :=
      shift(care, n = c(1L:4L), fill = NA, type = "lead", give.names = F),
      by = hh_id]
    # Create two variables combining the lags and leads of CARE status
    # NOTE: The goal of theses sums to avoid calling households CARE when
    # in fact the households are receiving credit from PG&E's winter gas
    # savings programs. The relevant months are January and February.
    # The motivation for my omitted lag/lead structure:
    # AUG SEP OCT NOV DEC JAN FEB MAR APR MAY JUN JUL
    #      Y   Y   Y   Y   |   -   Y   Y   Y
    #          Y   Y   Y   -   |   Y   Y   Y   Y
    # A webpage for the program:
    #  http://www.pgecurrents.com/2013/01/15/more-than-2-million-...
    #  pge-customers-on-track-for-winter-gas-savings-credits/
    # Create the variable for January: sum the number of CARE instances
    # in the relevant leads and lags; then divide by the non-NA
    pge_dt[, care_pct_jan :=
      (
        ifelse(is.na(care_lag1), 0, care_lag1) +
        ifelse(is.na(care_lag2), 0, care_lag2) +
        ifelse(is.na(care_lag3), 0, care_lag3) +
        ifelse(is.na(care_lag4), 0, care_lag4) +
        ifelse(is.na(care_lead2), 0, care_lead2) +
        ifelse(is.na(care_lead3), 0, care_lead3) +
        ifelse(is.na(care_lead4), 0, care_lead4)
      ) / (
        (!is.na(care_lag1)) +
        (!is.na(care_lag2)) +
        (!is.na(care_lag3)) +
        (!is.na(care_lag4)) +
        (!is.na(care_lead2)) +
        (!is.na(care_lead3)) +
        (!is.na(care_lead4))
      )]
    # Create the variable for February: sum the number of CARE instances in the
    # relevant leads and lags; then divide by the non-NA
    pge_dt[, care_pct_feb :=
      (
        ifelse(is.na(care_lag2), 0, care_lag2) +
        ifelse(is.na(care_lag3), 0, care_lag3) +
        ifelse(is.na(care_lag4), 0, care_lag4) +
        ifelse(is.na(care_lead1), 0, care_lead1) +
        ifelse(is.na(care_lead2), 0, care_lead2) +
        ifelse(is.na(care_lead3), 0, care_lead3) +
        ifelse(is.na(care_lead4), 0, care_lead4)
      ) / (
        (!is.na(care_lag2)) +
        (!is.na(care_lag3)) +
        (!is.na(care_lag4)) +
        (!is.na(care_lead1)) +
        (!is.na(care_lead2)) +
        (!is.na(care_lead3)) +
        (!is.na(care_lead4))
      )]
    # Update January and February specifications of CARE
# NOTE: We could choose an alternate threshold (currently 0.5)
    pge_dt[(month == "01") & (care_pct_jan > 0.5) & (bill_flag == T),
      care := T]
    pge_dt[(month == "01") & (care_pct_jan < 0.5) & (bill_flag == T),
      care := F]
    pge_dt[(month == "01") & (care_pct_jan == 0.5) & (bill_flag == T),
      care := care]
    pge_dt[(month == "02") & (care_pct_feb > 0.5) & (bill_flag == T),
      care := T]
    pge_dt[(month == "02") & (care_pct_feb < 0.5) & (bill_flag == T),
      care := F]
    pge_dt[(month == "02") & (care_pct_feb == 0.5) & (bill_flag == T),
      care := care]

    # Add minimum transportation charge ----------------------------------------
    # PG&E has a minimum charge of 0.09863 dollars per day, while SoCalGas uses
    # their customer charge to achieve a minimum charge.
    # The goal here: find the PG&E bills below the threshold. I separate by
    # households that look like CARE household and those that do not using
    # 4 lags (excluding lag #1) and 4 leads of the CARE variable.
    # Once we find the housholds, increase the bill to the minimum.
    # We need to do this calcualtion for each month of the bill.
    # Care households:
    pge_dt[(care_pct_feb >= 0.5) & (rev_hat1 < days1 * 0.09863),
      rev_hat1 := days1 * 0.09863]
    pge_dt[(care_pct_feb >= 0.5) & (rev_hat2 < days2 * 0.09863),
      rev_hat2 := days2 * 0.09863]
    pge_dt[(care_pct_feb >= 0.5) & (rev_hat3 < days3 * 0.09863),
      rev_hat3 := days3 * 0.09863]
    # Non-CARE households
    pge_dt[(care_pct_feb < 0.5) & (rev_hat1 < days1 * 0.09863),
      rev_hat1 := days1 * 0.09863]
    pge_dt[(care_pct_feb < 0.5) & (rev_hat2 < days2 * 0.09863),
      rev_hat2 := days2 * 0.09863]
    pge_dt[(care_pct_feb < 0.5) & (rev_hat3 < days3 * 0.09863),
      rev_hat3 := days3 * 0.09863]

    # Update revenue and error calculations ------------------------------------
    # Recalculate rev_hat now that we've changed some month's rev_hat values
    pge_dt[, rev_hat := rev_hat1 + rev_hat2 + rev_hat3]
    # Now update rev_error
    pge_dt[, rev_error := rev - rev_hat]
    # Update pct_error accordingly
    pge_dt[, pct_error := rev_error / rev]
    pge_dt[rev == 0, pct_error := 0]
    # Flag bills where the percentage error is more than 5 percent
    pge_dt[, bill_flag := abs(pct_error) > 0.05]
    # Extend a lag of the flag (to know if the previous bill was flagged)
    setorder(pge_dt, hh_id, begin_date)
    pge_dt[,
      bill_flag_lag1 := shift(bill_flag, n = 1L, fill = NA, type = "lag"),
      by = hh_id]
    # Count the number of flags within a household
    pge_dt[, n_flags := sum(bill_flag), hh_id]
    # Calculate the percentage of flags within a household
    pge_dt[, pct_flags := n_flags / .N, hh_id]

    # Calculate prices ---------------------------------------------------------
    # Determine the relevant PPPS rate for each month
    pge_dt[care == T, ppps_charge1 := ppps_care1]
    pge_dt[care == T, ppps_charge2 := ppps_care2]
    pge_dt[care == T, ppps_charge3 := ppps_care3]
    pge_dt[care == F, ppps_charge1 := ppps1]
    pge_dt[care == F, ppps_charge2 := ppps2]
    pge_dt[care == F, ppps_charge3 := ppps3]
    # Take time-weighted average of PPPS charge
    pge_dt[, ppps_charge :=
      ifelse(days1 > 0, days1/days * ppps_charge1, 0) +
      ifelse(days1 > 0, days1/days * ppps_charge1, 0) +
      ifelse(days1 > 0, days1/days * ppps_charge1, 0)
      ]
    # Calculate total monthly bills (revenue plus PPPS per therm charge);
    # Recall that PGE "rev" does not yet include PPPS charges, while SoCal
    # already includes PPPS.
    pge_dt[, total_bill1 := rev_hat1 + thm1 * ppps_charge1]
    pge_dt[, total_bill2 := rev_hat2 + thm2 * ppps_charge2]
    pge_dt[, total_bill3 := rev_hat3 + thm3 * ppps_charge3]
    # Now the full bill
    pge_dt[, total_bill :=
      total_bill1 +
      ifelse(is.na(total_bill2), 0, total_bill2) +
      ifelse(is.na(total_bill3), 0, total_bill3)
      ]
    # Calculate average prices: total bill divided by therms, where total
    # bill includes PPPS charges.
    pge_dt[, price_avg := total_bill / thm]
    # Create time-weighted averages of base and excess price
    # NOTE: Necessary because a single therm is distributed across all months
    pge_dt[, `:=`(
      charge_base =
        days1/days * charge_base1 +
        days2/days * charge_base2 +
        days3/days * charge_base3,
      charge_care_base =
        days1/days * charge_care_base1 +
        days2/days * charge_care_base2 +
        days3/days * charge_care_base3,
      charge_excess =
        days1/days * charge_excess1 +
        days2/days * charge_excess2 +
        days3/days * charge_excess3,
      charge_care_excess =
        days1/days * charge_care_excess1 +
        days2/days * charge_care_excess2 +
        days3/days * charge_care_excess3
      )]
    # Create variables for the baseline and excess prices
    pge_dt[care == T, price_base := charge_care_base]
    pge_dt[care == F, price_base := charge_base]
    pge_dt[care == T, price_excess := charge_care_excess]
    pge_dt[care == F, price_excess := charge_excess]
    # Likewise for "initial" prices
    pge_dt[care == T, price_base_init := charge_care_base_init]
    pge_dt[care == F, price_base_init := charge_base_init]
    pge_dt[care == T, price_excess_init := charge_care_excess_init]
    pge_dt[care == F, price_excess_init := charge_excess_init]
    # Likewise for monthly prices
    pge_dt[care == T, price_base1 := charge_care_base1]
    pge_dt[care == F, price_base1 := charge_base1]
    pge_dt[care == T, price_excess1 := charge_care_excess1]
    pge_dt[care == F, price_excess1 := charge_excess1]
    pge_dt[care == T, price_base2 := charge_care_base2]
    pge_dt[care == F, price_base2 := charge_base2]
    pge_dt[care == T, price_excess2 := charge_care_excess2]
    pge_dt[care == F, price_excess2 := charge_excess2]
    pge_dt[care == T, price_base3 := charge_care_base3]
    pge_dt[care == F, price_base3 := charge_base3]
    pge_dt[care == T, price_excess3 := charge_care_excess3]
    pge_dt[care == F, price_excess3 := charge_excess3]
    # Determine marginal prices within each month
    pge_dt[, `:=`(
      price_mrg1 = price_base1 * (1 - exceeded1) + price_excess1 * exceeded1,
      price_mrg2 = price_base2 * (1 - exceeded2) + price_excess2 * exceeded2,
      price_mrg3 = price_base3 * (1 - exceeded3) + price_excess3 * exceeded3
      )]
# HACK: This part is a bit challenging: in the same bill, a household can
# exceed its allowance in one month and not in another month, which means
# the household pays the second-tier price in one month and the first-tier
# price in another month—all within the same bill.
# I take the time-weighted average of each month's marginal price.
    # Find time-weighted average of month's marginal prices
    pge_dt[, price_mrg :=
      ifelse(days1 > 0, days1/days * price_mrg1, 0) +
      ifelse(days2 > 0, days2/days * price_mrg2, 0) +
      ifelse(days3 > 0, days3/days * price_mrg3, 0)
      ]
    # Calculate average marginal price (therm-weighted-average of mrg. prices)
    # First for each month in the bill:
    pge_dt[, `:=`(
      price_avg_mrg1 =
        thm_base1/thm1 * price_base1 + thm_excess1/thm1 * price_excess1,
      price_avg_mrg2 =
        thm_base2/thm2 * price_base2 + thm_excess2/thm2 * price_excess2,
      price_avg_mrg3 =
        thm_base3/thm3 * price_base3 + thm_excess3/thm3 * price_excess3
      )]
    # Now for the whole bill, weighting by days in each month
    pge_dt[, price_avg_mrg :=
      ifelse(days1 > 0, days1/days * price_avg_mrg1, 0) +
      ifelse(days2 > 0, days2/days * price_avg_mrg2, 0) +
      ifelse(days3 > 0, days3/days * price_avg_mrg3, 0)
      ]
    # Update preceding calculations with the PPPS volumetric charges:
    # Baseline and excess prices
    pge_dt[, price_base_ppps := price_base + ppps_charge]
    pge_dt[, price_excess_ppps := price_excess + ppps_charge]
    # "Initial" prices
    pge_dt[, price_base_init_ppps := price_base_init + ppps_charge]
    pge_dt[, price_excess_init_ppps := price_excess_init + ppps_charge]
    # Marginal prices
    pge_dt[, price_mrg_ppps := price_mrg + ppps_charge]
    # Average marginal price
    pge_dt[, price_avg_mrg := price_avg_mrg + ppps_charge]

    # Calculate total bill -----------------------------------------------------
    # Enforce PG&E's minimum bill charge
    pge_dt[(care_pct_feb >= 0.5) & (thm * charge_care_base < days * 0.09863),
      total_bill := days * 0.09863]
    pge_dt[(care_pct_feb < 0.5) & (thm * charge_base < days * 0.09863),
      total_bill := days * 0.09863]

    # Add misc. variables ------------------------------------------------------
    # Add utility (helpful upon full merge)
		pge_dt[, utility := "pge"]
    # Add utility and zip code to the household ID (and pad exiting ID)
    pge_dt[, hh_id :=
      paste0("pge", the_zip, str_pad(hh_id, 8, "left", 0))]
    pge_weather[, hh_id :=
      paste0("pge", the_zip, str_pad(hh_id, 8, "left", 0))]
    # Fix names of temperature bins (pad bin number with zeros)
    old_bins <- pge_weather %>% names() %>% grep("t_p_", ., value = T)
    new_bins <- old_bins %>% gsub("t_p_", "", .) %>% str_pad(2, "left", 0)
    new_bins %<>% paste0("t_p_", .)
    setnames(pge_weather, old = old_bins, new = new_bins)
    # Calculate heating degree days within bill
# HACK: These intra-bill HDDs are approximate
    # NOTE: See Eds_excellent_adventure.do for explanation of bins
    pge_weather[, bill_hdd :=
      t_p_01 * (65 - 24) +
      t_p_02 * (65 - (35 - 24) / 2) +
      t_p_03 * (65 - (40 - 35) / 2) +
      t_p_04 * (65 - (47 - 40) / 2) +
      t_p_05 * (65 - (51 - 47) / 2) +
      t_p_06 * (65 - (55 - 51) / 2) +
      t_p_07 * (65 - (59 - 55) / 2) +
      t_p_08 * (65 - (63 - 59) / 2)]
    # Merge weather data back onto the billing data
    pge_dt %<>% merge(., y = pge_weather,
      by = c("hh_id", "begin_date", "end_date"),
      all.x = T, all.y = F, sort = F)
	  # Create household-month FEs
	  pge_dt[, hh_month := paste0(hh_id, month)]
    # Create zip-9 variable
	  # NOTE: we are missing many of the 4-digit extensions of the zip;
	  # for now, they get their own 4-digit extension (e.g. 94702NA)
	  pge_dt[, zip9 := paste0(zip, zip_plus4)]
    # Create therms per day variable
    pge_dt[, thm_day := thm / days]
    # Aggregate the bill's allowances
    pge_dt[, allowance := allowance1 + allowance2 + allowance3]

    # Drop unwanted variables --------------------------------------------------
    # Find variables we want drop: they are the monthly variables that end in
    # 1, 2, or 3 (and whose penultimate character is non-numeric); drop
    # "bill_flag_lag1" from the list
    to_drop <- pge_dt %>%
      names() %>%
      grep("[^0-9][1-3]$", ., value = T) %>%
      grep("bill_flag_lag1", ., value = T, invert = T)
    # Drop the variables
    pge_dt[, (to_drop) := NULL]
    # Clean up
    rm(to_drop); gc()

    # Add lags (and leads) for simulated instruments ---------------------------
    # Add thm lags for simulated instruments (10 to 14 periods)
    setorder(pge_dt, hh_id, begin_date)
		pge_dt[, paste0("thm_lag", 10:14) :=
	    shift(thm, n = c(10L:14L), fill = NA, type = "lag", give.names = F),
	    by = hh_id]
    # Add days lags for simulated instruments (10 to 14 periods)
    setorder(pge_dt, hh_id, begin_date)
		pge_dt[, paste0("days_lag", 10:14) :=
	    shift(days, n = c(10L:14L), fill = NA, type = "lag", give.names = F),
	    by = hh_id]

    # Construct simulated instruments ------------------------------------------
    # Determine which lagged periods the HH exceeded the current allowance
    pge_dt[, `:=`(
      exceeded_sim_10 = (thm_lag10 / days_lag10) > (allowance / days),
      exceeded_sim_11 = (thm_lag11 / days_lag11) > (allowance / days),
      exceeded_sim_12 = (thm_lag12 / days_lag12) > (allowance / days),
      exceeded_sim_13 = (thm_lag13 / days_lag13) > (allowance / days),
      exceeded_sim_14 = (thm_lag14 / days_lag14) > (allowance / days)
      )]
    # Pct the HH exceeded their allowance using lags 10-14 and 11-13
    pge_dt[, `:=`(
      exceeded_sim_10_14 = (
        ifelse(is.na(exceeded_sim_10), 0, exceeded_sim_10) +
        ifelse(is.na(exceeded_sim_11), 0, exceeded_sim_11) +
        ifelse(is.na(exceeded_sim_12), 0, exceeded_sim_12) +
        ifelse(is.na(exceeded_sim_13), 0, exceeded_sim_13) +
        ifelse(is.na(exceeded_sim_14), 0, exceeded_sim_14)
      ) / (
        (!is.na(exceeded_sim_10)) +
        (!is.na(exceeded_sim_11)) +
        (!is.na(exceeded_sim_12)) +
        (!is.na(exceeded_sim_13)) +
        (!is.na(exceeded_sim_14))
      ),
      exceeded_sim_11_13 = (
        ifelse(is.na(exceeded_sim_11), 0, exceeded_sim_11) +
        ifelse(is.na(exceeded_sim_12), 0, exceeded_sim_12) +
        ifelse(is.na(exceeded_sim_13), 0, exceeded_sim_13)
      ) / (
        (!is.na(exceeded_sim_11)) +
        (!is.na(exceeded_sim_12)) +
        (!is.na(exceeded_sim_13))
      ) )]
    # Create the actual simulated instrument variables (10-14 and 11-13)
    pge_dt[, `:=`(
      mrg_sim_iv_10_14 =
        (exceeded_sim_10_14 <= 0.5) * price_base +
        (exceeded_sim_10_14 > 0.5) * price_excess,
      mrg_sim_iv_11_13 =
        (exceeded_sim_11_13 <= 0.5) * price_base +
        (exceeded_sim_11_13 > 0.5) * price_excess
      )]

    # Add more lags and leads --------------------------------------------------
    # Temperature bin names
    bin_names <- paste0("t_p_", str_pad(1:14, 2, "left", 0))
    # Names of variables to lag/lead
    var_names <- c(
      # Non-price variables
      "allowance", "thm", "thm_day",
      "days", "month", bin_names,
      "bill_hdd",
      "east_year_hdd", "ca_year_hdd",
      "east_year_cdd", "ca_year_cdd",
      "east_year_dd", "ca_year_dd",
      "total_bill", "bill_flag", "care",
      # Price variables
      "price_avg", "price_avg_mrg",
      "price_base", "price_base_ppps", "price_base_init",
      "price_excess", "price_excess_ppps", "price_excess_init",
      "ppps_charge",
      "price_mrg", "price_mrg_ppps",
      "mrg_sim_iv_10_14", "mrg_sim_iv_11_13",
      "spot_price_1week","spot_price_1week_init",
      "spot_price_2week", "spot_price_2week_init",
      "spot_price_3week", "spot_price_3week_init")
    # Define the number of lags and leads
    the_lags <- c(1:15)
    the_leads <- 1:3
    # Names for new lag/lead variables
    lag_names <- paste0(rep(var_names, each = length(the_lags)),
      "_lag", the_lags)
    lead_names <- paste0(rep(var_names, each = length(the_leads)),
      "_lead", the_leads)
    # Drop variables whose names overlap with the list of lags/leads
    pge_dt[, intersect(c(lag_names, lead_names), names(pge_dt)) := NULL]
    # Order the rows by household and bills' start dates
    setorder(pge_dt, "hh_id", "begin_date")
    # Create lags
    pge_dt[, (lag_names) := shift(.SD, the_lags, NA, "lag"),
      by = hh_id, .SDcols = var_names]
    # Create leads
    pge_dt[, (lead_names) := shift(.SD, the_leads, NA, "lead"),
      by = hh_id, .SDcols = var_names]

    # Save full dataset --------------------------------------------------------
    # Order columns alphabetically
    setcolorder(pge_dt, order(names(pge_dt)))
    # Save
    saveRDS(object = pge_dt,
      file = paste0(dir_rds, "Prices/PGE/prices_pge_", the_zip, ".rds"))

    # # Create and save subset ---------------------------------------------------
    # # Define the percentage of observations that we wish to keep
    # pct <- 0.05
    # # Compose set of unique household IDs with their counts; keep households
    # # with at least 12 observations
    # id_dt <- pge_dt[, .N, by = hh_id][N > 12]
    # # Sample 'pct' percent of the IDs (rounding up to nearest integer)
    # ids_sampled <- sample(
    #   x = id_dt[,hh_id],
    #   size = ceiling(nrow(id_dt) * pct))
    # # Grab the subset of sampled households from bill_dt
    # sample_dt <- pge_dt[hh_id %in% ids_sampled]
    # # Save the sampled dataset
    # saveRDS(object = sample_dt,
    #   file = paste0(dir_sample, "pge", the_zip, ".rds"))

    # The end ------------------------------------------------------------------
  }
  # Kill cluster
  stopCluster(cl)
  # Clean up
  gc()

# Data notes -------------------------------------------------------------------
#   - In the 2009-2015 subset of PG&E and SoCalGas, all tariff and allowance
#     changes occur on the first of the month.
#   - The prefix "H" for PG&E rate schedules implies smart metering.
#   - Other rate schedule codings (PG&E)
#       * G-1 – Residential Service
#       * GM – Master-Metered Multifamily Service
#       * GS – Multifamily Service
#       * GT – Mobilehome Park Service
#       * G1-NGV – Residential Natural Gas Service for Compression
#         on Customers' Premises
#       * GL-1 – Residential CARE Program Service
#       * GML – Master-Metered Multifamily CARE Program Service
#       * GSL – Multi family CARE Program Service
#       * GTL – Mobilehome Park CARE Program Service
#       * GL1-NGV – Residential CARE Program Natural Gas Service for
#         Compression on Customers' Premises.
#     source: http://www.pge.com/nots/rates/tariffs/tm2/pdf/GAS_3281-G.pdf
#   - SoCalGas uses GR for standard consumer rates and GRL for CARE
#     source: https://www.socalgas.com/regulatory/tariffs/tm2/pdf/3501.pdf
